DAM-related clusters

Author

Ricardo Martins-Ferreira

Homeostatic clusters

This script depicts the specific and in-depth characterization of the clusters related to the disease-associated microglia (DAM) signature, including gene markers, gene ontology, gene set enrichment anaysis, and transcription factor enrichment, related to Figure 3.

Load required packages

libs <- c("Seurat", "tidyverse", "clusterProfiler","viper","dorothea","org.Hs.eg.db","ggrepel","gridExtra","ggpubr","readxl","msigdbr","fgsea","data.table")
suppressMessages(
suppressWarnings(sapply(libs, require, character.only =TRUE))
)
         Seurat       tidyverse clusterProfiler           viper        dorothea 
           TRUE            TRUE            TRUE            TRUE            TRUE 
   org.Hs.eg.db         ggrepel       gridExtra          ggpubr          readxl 
           TRUE            TRUE            TRUE            TRUE            TRUE 
        msigdbr           fgsea      data.table 
           TRUE            TRUE            TRUE 

Representation of the main markers

Based on the Module Score expression of the original DAM signature from Keren-Shaul et al (2017) and the more recent DAM, DIM (disease-inflammatory macrophage) and YAM (youth-associated microglia) from Silvin et al (2022), we determined clusters 1, 2, 3, 5 and 6 as DAM-related. In addition to the mentioned Module Scores, we represented the expression of the top 10 significant markers for each cluster. The used sigantures are included at the “Genesets_literature” file in this repository (“Support data”).

Dotplot

# define gene signatures
genesets$genesets <- paste0(genesets$Population, "_",genesets$Study)

All_DAM <- genesets$Gene[genesets$genesets=="up_DAM_Keren_shaul_Thrupp"]
Human_DAM_signature <- genesets$Gene[genesets$genesets=="Human_Adult_DAMs_Silvin"]
Human_DIM_signature <- genesets$Gene[genesets$genesets=="Human_Adult_DIMs_Silvin"]
Human_YAM_signature <- genesets$Gene[genesets$genesets=="Human_Adult_YAMs_Silvin"]

# Add Module Scores
Humica <- SetIdent(Humica, value = Humica@meta.data$integrated_snn_res.0.2)
DefaultAssay(Humica)<-"RNA"
Humica <- NormalizeData(Humica)

Humica <- AddModuleScore(Humica,
                         features =list(All_DAM),
                         name='DAM.ALL')

Humica <- AddModuleScore(Humica,
                         features =list(Human_DAM_signature),
                         name='Human_DAM_Silvin')

Humica <- AddModuleScore(Humica,
                         features =list(Human_YAM_signature),
                         name='Human_YAM_Silvin')

Humica <- AddModuleScore(Humica,
                         features =list(Human_DIM_signature),
                         name='Human_DIM_Silvin')

# define top 10 markers per DAM-related cluster
DAM_markers <- sig_markers[sig_markers$cluster %in% c("1","2","3","5","6"),]
DAM_markers$avg_log2FC<- as.numeric(DAM_markers$avg_log2FC)

DAM_markers %>%
  group_by(cluster) %>% arrange(-avg_log2FC) %>% arrange(cluster) %>% 
  top_n(n = 10, wt = avg_log2FC)-> top10_up 

# DotPlot
features <- c("DAM.ALL1","Human_DAM_Silvin1","Human_YAM_Silvin1","Human_DIM_Silvin1",top10_up$gene %>% unique)

DotPlot(Humica, features =factor(features, levels = features), 
        dot.scale = 10) +
  scale_colour_gradient2(low = "darkblue", mid = "white", high = "darkred")+
  #scale_color_virgeneis_c() +
  theme(axis.text.x = element_text(angle=90, hjust = 0))

Gene ontology

Gene ontology (GO) was calculated with the enrichGO function from clusterProfiler.

Barplots

The top 10 significant GO terms for each of the homeostatic clusters were represented in barplots to provide a broader insight on the GO enrichment (related to Supplementary Figure 7B). GO was recalculated and edited to obtain the Fold Change of enrichment.

i = 0
results <- list()
go_all_simplify <- data.frame()
for(comparison in levels(factor(DAM_markers$cluster))){
  i = i+1
  print(paste(comparison))
  ego <- enrichGO(gene              = DAM_markers[DAM_markers$cluster == comparison,]$gene,
                  OrgDb             = org.Hs.eg.db,
                  ont               = "ALL",
                  keyType           = "SYMBOL",
                  pAdjustMethod     = "BH",
                  pvalueCutoff      = 1,
                  qvalueCutoff      = 0.05,
                  readable          = T )
  ego <- simplify(ego, cutoff=0.7, by="p.adjust", select_fun=min)
  
  results[[comparison]] <- ego@result %>%
    separate(GeneRatio, into = c("gene_pos", "gene_total"), sep = "/") %>%
    separate(BgRatio, into = c("bg_pos", "bg_total"), sep = "/") %>%
    mutate(FC = (as.numeric(gene_pos)/as.numeric(gene_total)) /
             (as.numeric(bg_pos)/as.numeric(bg_total)),
           cluster = comparison) %>%
    arrange(.$p.adjust)
  go_all_simplify <- rbind(go_all_simplify, results[[comparison]])
}
clusterlist <- c("1","2","3","5","6")
p<-list()
for (i in 1:length(clusterlist)) {
  data <-go_all_simplify[go_all_simplify$cluster==clusterlist[i],]
  data$logpadj <- -log10(data$p.adjust)
  data$logFC<- log10(data$FC)
  top10 <- data %>%top_n(n = 10, wt = logpadj)
  top10 <- top10[1:10,]
  
  
  p[[i]]<- ggplot(top10,
                  aes(fill=logFC,
                      x=factor(ID, level = rev(ID)),
                      y=logpadj,
                      color="black")) +
    geom_col(color="black", size=0.2) +
    geom_text(aes(label = Description), hjust=0,position = position_fill(vjust = 0), size = 4, color="black")+
    theme_bw() +
    scale_fill_gradientn(colours = c("#B9CFD4","#AFAAB9","#B48291","#8C5465"))+
    #scale_fill_viridis(option = "plasma")+
    coord_flip()
}
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
do.call(grid.arrange,p)

Gene set enrichment analysis (GSEA)

GSEA analysis was performed using as reference all genesets from msigdbr for the “C2” category of “Homo sapiens”. The results from FindAllMarkers with a less significant cutoff were used as cutoff. This intends to account for the broad spectrum of differential expression and not only the significant differentially expressed markers (only.pos = F, min.pct = 0.1, logfc.threshold = 0.0, test.use = “MAST”). Of note, both up and downregulated markers were considered at “Humica.markers”. The avg_log2FC and the p_val_adj were used for ranking.

all_gene_sets = msigdbr(species = "Homo sapiens",category = c("C2"))

msigdbr_list = split(x = all_gene_sets$gene_symbol, f = all_gene_sets$gs_name)

fgsea_ALL <- data.frame()
for (i in 1:length(clusterlist)) {
data <- Humica_markers[Humica_markers$cluster==clusterlist[i],]  
data2 <- structure(data$avg_log2FC, names=data$gene)
df <- fgsea(pathways =msigdbr_list, 
            stats    = data2,
            scoreType ="pos",
            minSize  = 1,
            maxSize  = Inf)


df$cluster <- clusterlist[i]

fgsea_ALL <- rbind(fgsea_ALL, df)
}

Heatmap representation of the results

A list of terms of interested were selected from the results output. The results are presented as a function of the Normalized Enrichment Score (NES) and the negative of the log of the adjusted p value.

## with - LOG adj p value
fgsea_res$log <- -log(fgsea_res$padj)
fgsea_res <- fgsea_res[order(fgsea_res$log),]

fgsea_res$NES<- fgsea_res$NES %>% as.numeric()

fgsea_res$pathway2 <- paste0(fgsea_res$pathway,"_",fgsea_res$cluster)

ggplot(fgsea_res, aes(factor(pathway2, levels=pathway2), log, fill=NES))+
  geom_bar(stat = "identity")+
  geom_text(aes(label = pathway2),color="black", hjust=0,position = position_fill(vjust = 0.2), size = 4)+
  scale_fill_gradientn(colours = c("#F0FFCE","#CCC9A1","#CD543C"))+
  #scale_size_continuous(range = c(6, 14))+
  coord_flip()+
  facet_grid(cluster~., shrink = TRUE, scales = "free", space="free")+
  theme_bw()+
  theme(axis.text.y  = element_blank())

Transcription factor enrichment

The collection of transcription factors (TFs) and their target genes was obtained from the CollecTRI repository from decoupleR. The enrichment analysis was performed using the viper package. The Humica.markers file correspond to the broad spectrum differential expression output, mentioned in the “Characterization of the HuMicA” script of this repository. The following represents the loop for the calculation of TF enrichment in all nine clusters.

#Create the regulon from the CollecTRI repository in the appropriate format for viper.
net <- decoupleR::get_collectri(organism='human', split_complexes=FALSE)

## create regulon for viper
# Function to create named tfmode and add likelihood
create_named_tfmode <- function(df) {
  # Ensure tfmode is numeric
  tfmode_named <- setNames(as.numeric(df$mor), df$target)
  likelihood_vector <- rep(1, length(tfmode_named))  # Assuming constant likelihood of 1
  list(tfmode = tfmode_named, likelihood = likelihood_vector)
}

# Group by source and apply the transformation
source_target_list <- net %>%
  group_by(source) %>%
  summarise(
    tfmode = list(setNames(as.numeric(mor), target)),
    likelihood = list(rep(1, n()))
  ) %>%
  deframe()
Warning: `x` must be a one- or two-column data frame in `deframe()`.
regulon_TRI <- source_target_list
list <- unique(Humica_markers$cluster)
TF_activities_all <- data.frame()

for (i in 1:length(list)) {

data <- Humica_markers[Humica_markers$cluster==list[i],]

#Exclude probes with unknown or duplicated gene symbol
DEsignature = subset(data, gene != "" )

DEsignature = subset(DEsignature, ! duplicated(gene))

# Estimatez-score values for the GES. Cheeck VIPER manual for details

myStatistics = matrix(DEsignature$avg_log2FC, dimnames = list(DEsignature$gene,
                                                              
                                                              'avg_log2FC') )

myPvalue = matrix(DEsignature$p_val_adj, dimnames = list(DEsignature$gene, 'padj') )

mySignature = (qnorm(myPvalue/2, lower.tail = FALSE) * sign(myStatistics))[, 1]

mySignature = mySignature[order(mySignature, decreasing = T)]

# Estimate TF activities

mrs = msviper(ges = mySignature, regulon = regulon_TRI,minsize = 25, ges.filter = F)

TF_activities = data.frame(Regulon = names(mrs$es$nes),
                           
                           Size = mrs$es$size[ names(mrs$es$nes) ],
                           
                           NES = mrs$es$nes,
                           
                           p.value = mrs$es$p.value,
                           
                           FDR = p.adjust(mrs$es$p.value, method = 'fdr'))

TF_activities = TF_activities[ order(TF_activities$p.value), ]
TF_activities$cluster <- list[i]
TF_activities_all<- rbind(TF_activities_all, TF_activities)
}

Dotplot

The plot depicts the enrichment for all clusters in the HuMicA but only considers the significant TFs for the three homeostatic clusters is considered for representation. The statistically significant TFs were considered for an adj p value (FDR) < 0.05.

TF_activities_DAM <- TF_activities_all[TF_activities_all$cluster %in% c("1","2","3","5","6"),]

TF_activities_DAM_sig <- TF_activities_DAM[TF_activities_DAM$FDR < 0.05,]

#Plot TF enrichment results
TF_activities_all <- TF_activities_all%>% 
  mutate(logFDR = -log(FDR))

common_features <- TF_activities_DAM_sig$Regulon %>% unique()

ggplot(TF_activities_all %>% dplyr::filter(Regulon %in% common_features), aes(as.factor(cluster), y= factor(Regulon, levels = common_features),color=NES,size =logFDR)) +
  geom_point() +
  scale_color_gradient2(low="#47663D",mid = "white",high="#B86B00")+
  theme_pubr(border = 1)+ 
  coord_flip()+
  theme(axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5))